#Goal of document: Apply the Poisson Log-Normal Mixed Model of the cytoeffect package to the Palgen HIV dataset (2019).

#cytoeffect - 3 parts: A) logistic linear mixed model (LLMM); B) poisson log-normal mixed model (PLMM); This is part B.

Prerequisites

###Parse input parameters.

ncells = 3000 #later can set to Inf (infinite)
seed = 0xdada
ncores = parallel::detectCores()
cytof_data = "se_palgen2019vaccine.Rdata"
prefit_PLMM = paste0("cytoeffect_plmm_ncells_",ncells,".Rdata")
prefit_PLMM
## [1] "cytoeffect_plmm_ncells_3000.Rdata"

###Install packages.

pkgs_needed = c("devtools","tidyverse","magrittr","SummarizedExperiment",
                "ggthemes","cowplot","RColorBrewer","broom","hexbin",
                "intergraph","igraph","ggnetwork","ggcorrplot","MASS",
                "parallel","dplyr")
letsinstall = setdiff(pkgs_needed, installed.packages())
if (length(letsinstall) > 0) {
  BiocManager::install(letsinstall)
}
devtools::install_github("ChristofSeiler/cytoeffect")

###Load packages.

library("cytoeffect")
library("tidyverse")
library("magrittr")
library("SummarizedExperiment")
library("ggthemes")
library("cowplot")
library("RColorBrewer")
library("broom")
library("intergraph")
library("igraph")
library("ggnetwork")
library("ggcorrplot")
library("MASS")
library("parallel")
library("dplyr")
theme_set(theme_few())

Load Data

###Load SummarizedExperiment object from CytoGLMM workflow.

rdata_filenames = c(cytof_data, prefit_PLMM)
load(cytof_data)
exprs = assay(se_palgen2019vaccine)
sample_info = rowData(se_palgen2019vaccine)
sample_info_names = names(sample_info)
df_samples = cbind(as.data.frame(exprs), as.data.frame(sample_info))
df_samples %<>% as.tibble
protein_names = colData(se_palgen2019vaccine) %>% 
  as.data.frame %>% 
  dplyr::filter(type != "unused") %>%
  .$protein_name

Fit Model

###Tally cell count.

df_samples %>% group_by(term,donor) %>% tally %>% arrange(n)
## # A tibble: 8 x 3
## # Groups:   term [2]
##   term       donor      n
##   <fct>      <chr>  <int>
## 1 Post-prime BD620  93624
## 2 Post-prime BB078 163789
## 3 Post-boost BD620 171421
## 4 Post-prime BC641 172985
## 5 Post-boost BB078 206027
## 6 Post-prime BB231 228105
## 7 Post-boost BC641 264225
## 8 Post-boost BB231 330010
df_samples %>% group_by(term,donor) %>% tally %>% arrange(desc(n))
## # A tibble: 8 x 3
## # Groups:   term [2]
##   term       donor      n
##   <fct>      <chr>  <int>
## 1 Post-boost BB231 330010
## 2 Post-boost BC641 264225
## 3 Post-prime BB231 228105
## 4 Post-boost BB078 206027
## 5 Post-prime BC641 172985
## 6 Post-boost BD620 171421
## 7 Post-prime BB078 163789
## 8 Post-prime BD620  93624

###Subsample cells to a maximum number of cells per donor.

if(nrow(df_samples) > ncells) {
  print(paste("subsampled to",ncells,"per donor"))
  set.seed(seed)
  # subsample depending on max cell count
  df_count = df_samples %>% group_by(donor) %>% tally() %>%
    mutate(nnew = ifelse(n > ncells,ncells,n))
  # create table with a data frame in one column
  df_nested = df_samples %>% group_by(donor) %>% nest() %>%
    left_join(df_count,by = "donor")
  # subsample per donor
  df_samples = df_nested %>%
    mutate(samp = map2(data, nnew, sample_n)) %>%
    dplyr::select(donor, samp) %>%
    unnest()
} else {
  print("no subsampling done")
}
## [1] "subsampled to 3000 per donor"

###Tally cell count.

df_samples %>% group_by(term,donor) %>% tally %>% 
  arrange(n)
## # A tibble: 8 x 3
## # Groups:   term [2]
##   term       donor     n
##   <fct>      <chr> <int>
## 1 Post-prime BD620  1024
## 2 Post-prime BB231  1215
## 3 Post-prime BC641  1242
## 4 Post-prime BB078  1333
## 5 Post-boost BB078  1667
## 6 Post-boost BC641  1758
## 7 Post-boost BB231  1785
## 8 Post-boost BD620  1976
df_samples %>% group_by(term,donor) %>% tally %>% 
  arrange(desc(n))
## # A tibble: 8 x 3
## # Groups:   term [2]
##   term       donor     n
##   <fct>      <chr> <int>
## 1 Post-boost BD620  1976
## 2 Post-boost BB231  1785
## 3 Post-boost BC641  1758
## 4 Post-boost BB078  1667
## 5 Post-prime BB078  1333
## 6 Post-prime BC641  1242
## 7 Post-prime BB231  1215
## 8 Post-prime BD620  1024

HMC Sampling

Sample from posterior distribution (initialization details in paper).

df_samples$term %<>% factor(levels = c("Post-prime",
                                              "Post-boost"))
if(file.exists(prefit_PLMM)) {
  load(file = prefit_PLMM)
} else {
  obj = cytoeffect::poisson_lognormal(df_samples, protein_names, 
                                      condition = "term", group = "donor",
                                      iter = 325, warmup = 200, 
                                      num_chains = ncores)
  save(obj,file = prefit_PLMM)
}

HMC Diagnostics

Postprocessing of posterior samples. Traceplot of posterior samples.

ERROR - doesn’t look right - maybe just too many for space? 7/05 NOTE: successful poisson sampling; 62 post-sampling graphs, none show the usual colours, just numbers on top of each other –> too many graphs for the space, when subset shows correctly

rstan::traceplot(obj$fit_mcmc, inc_warmup = FALSE, pars = c("beta[1,1]", "beta[1,2]", "beta[2,1]", "beta[2,2]", "beta[3,1]", "beta[3,2]", "beta[4,1]", "beta[4,2]", "beta[5,1]", "beta[5,2]", "beta[6,1]", "beta[6,2]", "beta[7,1]", "beta[7,2]", "beta[8,1]", "beta[8,2]", "beta[9,1]", "beta[9,2]", "beta[10,1]", "beta[10,2]", "beta[11,1]", "beta[11,2]", "beta[12,1]", "beta[12,2]", "beta[13,1]", "beta[13,2]", "beta[14,1]", "beta[14,2]", "beta[15,1]", "beta[15,2]"))

rstan::traceplot(obj$fit_mcmc, inc_warmup = FALSE, pars = c("beta[16,1]", "beta[16,2]", "beta[17,1]", "beta[17,2]", "beta[18,1]", "beta[18,2]", "beta[19,1]", "beta[19,2]", "beta[20,1]", "beta[20,2]", "beta[21,1]", "beta[21,2]", "beta[22,1]", "beta[22,2]", "beta[23,1]", "beta[23,2]", "beta[24,1]", "beta[24,2]", "beta[25,1]", "beta[25,2]", "beta[26,1]", "beta[26,2]", "beta[27,1]", "beta[27,2]", "beta[28,1]", "beta[28,2]", "beta[29,1]", "beta[29,1]", "beta[30,1]", "beta[30,2]", "beta[31,1]", "beta[31,2]"))

Some more MCMC diagnostics. According to empirically findings, Rhat > 1.1 is usually indicative of problems in the fit.

pars = c("beta",
         "sigma","sigma_term","sigma_donor",
         "Cor","Cor_term","Cor_donor", "b_donor") 
#b_donor seems to be newly added
tb = summary(obj$fit_mcmc, 
             pars = pars)$summary %>% 
  as.tibble(rownames = "pars", .before = 1) %>% 
  dplyr::select(pars, n_eff, Rhat)
tb %<>% na.omit() # Stan fills upper triangle with zeros
tb %>% arrange(n_eff)
## # A tibble: 3,131 x 3
##    pars           n_eff  Rhat
##    <chr>          <dbl> <dbl>
##  1 Cor_term[1,4]   12.9  6.92
##  2 Cor_term[4,1]   12.9  6.92
##  3 Cor_term[1,24]  13.0  5.98
##  4 Cor_term[24,1]  13.0  5.98
##  5 Cor_term[1,2]   13.1  5.61
##  6 Cor_term[2,1]   13.1  5.61
##  7 Cor_term[1,26]  13.1  5.55
##  8 Cor_term[26,1]  13.1  5.55
##  9 Cor[1,4]        13.7  5.04
## 10 Cor[4,1]        13.7  5.04
## # … with 3,121 more rows
tb %>% arrange(desc(Rhat))
## # A tibble: 3,131 x 3
##    pars           n_eff  Rhat
##    <chr>          <dbl> <dbl>
##  1 Cor_term[1,4]   12.9  6.92
##  2 Cor_term[4,1]   12.9  6.92
##  3 Cor_term[1,24]  13.0  5.98
##  4 Cor_term[24,1]  13.0  5.98
##  5 Cor_term[1,2]   13.1  5.61
##  6 Cor_term[2,1]   13.1  5.61
##  7 Cor_term[1,26]  13.1  5.55
##  8 Cor_term[26,1]  13.1  5.55
##  9 Cor[1,24]       13.7  5.08
## 10 Cor[24,1]       13.7  5.08
## # … with 3,121 more rows
tb %>% summarize(min = min(n_eff), max = max(n_eff))
## # A tibble: 1 x 2
##     min   max
##   <dbl> <dbl>
## 1  12.9 2520.
tb %>% summarize(min = min(Rhat), max = max(Rhat))
## # A tibble: 1 x 2
##     min   max
##   <dbl> <dbl>
## 1 0.992  6.92

Results

###Plot posterior regression coefficients.

p1 = plot(obj, type = "beta") + 
  ggtitle(expression("Fixed Effects"~beta)) +
  theme(legend.position = "bottom") +
  guides(col = guide_legend(ncol = 1)) + 
  scale_color_few()
p1

plot(obj, type = "beta") + 
  facet_wrap(~condition, scales = "free_x") +
  theme(legend.position = "bottom") +
  guides(col = guide_legend(ncol = 1)) + 
  scale_color_few()

Comment: Graphs look very different from Seiler’s output. Seiler has most of his markers positive, of the order of magnitude of 1 to 3 and wih very small SD (whiskers). In this case, the graphs, are mostly negative for post-prime and post-boost isolated, with many around -8 but with much larger SD then Seiler’s. Most do not differ significantly between post prime and post boost. CXCR4 seems to be the only one whose tails do not cross the zero line of the post-boost - post-prime graph.

###Extract expected count difference for CD66

post_beta = rstan::extract(obj$fit_mcmc, pars = "beta")[[1]]

prime_index = which(levels(pull(obj$df_samples, obj$condition)) 
                    == "Post-prime")
boost_index = which(levels(pull(obj$df_samples, obj$condition)) 
                    == "Post-boost")
cd66_index = which(obj$protein_names == "CD66")
prime_log_count = quantile(post_beta[,cd66_index,prime_index], 
                           probs = c(0.025, 0.5, 0.975))
prime_log_count
##       2.5%        50%      97.5% 
## -0.9078138 -0.6160139 -0.3578553
exp(prime_log_count)
##      2.5%       50%     97.5% 
## 0.4034052 0.5400930 0.6991743
boost_log_count = quantile(post_beta[,cd66_index,boost_index], 
                           probs = c(0.025, 0.5, 0.975))
boost_log_count
##       2.5%        50%      97.5% 
## -0.3870309 -0.1003416  0.1464098
exp(boost_log_count)
##      2.5%       50%     97.5% 
## 0.6790701 0.9045284 1.1576705
diff_log_count = quantile(
  post_beta[,cd66_index,boost_index] - post_beta[,cd66_index,prime_index], 
  probs = c(0.025, 0.5, 0.975))
diff_log_count
##      2.5%       50%     97.5% 
## 0.4692777 0.5166285 0.5661941
exp(diff_log_count)
##     2.5%      50%    97.5% 
## 1.598839 1.676366 1.761550

Comment: something wrong: claims all categories are NA; in the past when running regression purely on functional proteins there was output here.

###Posterior multivariate pairs plot.

NOTE TO SELF: From heatmap of all markers and their differences between prime and boost - CD66 had a strong difference (seems has no whiskers in plots earlier), and CD45 (with perforine) also looked strongly different - just curious to look how it seems (and see if commands work)

CD66_index = which(obj$protein_names == "CD66")
IFNg_index = which(obj$protein_names == "IFNg")
CD45_index = which(obj$protein_names == "CD45")

post_beta = rstan::extract(obj$fit_mcmc, pars = "beta")[[1]]
tb_log_count = bind_rows(
  tibble(
    term = levels(pull(obj$df_samples, obj$condition))[1],
    CD66 = post_beta[,CD66_index,1],
    IFNg = post_beta[,IFNg_index,1],
    CD45 = post_beta[,CD45_index,1]
  ),
  tibble(
    term = levels(pull(obj$df_samples, obj$condition))[2],
    CD66 = post_beta[,CD66_index,2],
    IFNg = post_beta[,IFNg_index,2],
    CD45 = post_beta[,CD45_index,2]
  )
)
plot_diag = function(marker) {
  ggplot(tb_log_count, aes_string(marker, fill = "term")) + 
    geom_histogram(bins = 40, position = "identity", alpha = 0.5) +
    scale_fill_few()
}
plot_off_diag = function(marker1, marker2) {
  ggplot(tb_log_count, aes_string(marker1, marker2, color = "term")) +
    geom_density2d() + 
    scale_color_few()
}
ppair = plot_grid(
  plot_diag("CD66") + theme(legend.position = "none"),
  NULL, 
  NULL,
  plot_off_diag("CD66","IFNg") + theme(legend.position = "none"), 
  plot_diag("IFNg") + theme(legend.position = "none"), 
  NULL,
  plot_off_diag("CD66","CD45") + theme(legend.position = "none"), 
  plot_off_diag("IFNg","CD45") + theme(legend.position = "none"), 
  plot_diag("CD45") + theme(legend.position = "none"),
  ncol = 3
)
plot_grid(ppair,
          get_legend(plot_diag("CD66") + theme(legend.position = "bottom")),
          ncol = 1,
          rel_heights = c(1, .1))

ggsave(filename = "posterior_multivariate_plmm_3000.pdf", width = 8, height = 6)

###Plot posterior standard deviation.

p2 = plot(obj, type = "sigma") + 
  ggtitle("Marker Standard Deviation"~sigma) +
  theme(legend.position = "bottom") +
  guides(col = guide_legend(ncol = 1)) +
  scale_color_manual(values=c("#5DA5DA", "#FAA43A", "#F17CB0"))
p2

COMMENT: A line of blue that covers the orange dots all at post-prime and post-boost. Donor variations are very large, up to almost 2500 for protein CD8.

###Plot posterior correlations.

plist = plot(obj, type = "Cor")
plist
## [[1]]

## 
## [[2]]

## 
## [[3]]

Comment: All marker correlation tables (post-prime, post-boost, and donor) seem to not correlate at all. Table seems virtually white.

###Multivariate posterior MDS plots.

cytoeffect::plot_mds(obj)

ggsave(filename = "posterior_mds_latent_variable_mu_scaled_3000.pdf", width = 10, height = 5)
cytoeffect::plot_mds(obj, asp = FALSE)

ggsave(filename = "posterior_mds_latent_variable_mu_unscaled_3000.pdf", width = 10, height = 5)

This plot further confirms what has been shown throughout this analysis. Post-prime and post-boost do not differ significantly. They are clearly overlapped. The proteins are all clustered and not easy to read. The MDS1 only accounts for 13.4% which is relatively small.

###Pairwise correlation change between conditions.

marker_pair = c("TNFa","IFNg")
Cor = rstan::extract(obj$fit_mcmc, pars = "Cor")[[1]]
Cor_term = rstan::extract(obj$fit_mcmc, pars = "Cor_term")[[1]]
Cor_diff = Cor_term - Cor
tb_cor = Cor_diff[,
                  which(obj$protein_names == marker_pair[1]),
                  which(obj$protein_names == marker_pair[2])] %>% as.tibble
tb_cor %<>% mutate(
  side = if_else(tb_cor$value > 0, 
                 true = paste0("positive (", 100*mean(tb_cor$value > 0), "%)"),
                 false = paste0("negative (", 100*mean(tb_cor$value <= 0), "%)"))
)
# keep colors consistent
if(mean(tb_cor$value > 0) == 1) {
  fill_colors = "#E46726"
} else {
  fill_colors = c("#6D9EC1","#E46726")
}
ggplot(tb_cor, aes(value, fill = side)) + 
  geom_histogram(bins = 50, alpha = 0.7) +
  xlab(paste0("Cor_term(", paste(marker_pair, collapse = ", "),")" )) +
  ggtitle("Posterior Distribution") + 
  scale_fill_manual(values = fill_colors)

#try different pairs
marker_pair = c("CD66","CD45")
tb_cor = Cor_diff[,
                  which(obj$protein_names == marker_pair[1]),
                  which(obj$protein_names == marker_pair[2])] %>% as.tibble
tb_cor %<>% mutate(
  side = if_else(tb_cor$value > 0, 
                 true = paste0("positive (", 100*mean(tb_cor$value > 0), "%)"),
                 false = paste0("negative (", 100*mean(tb_cor$value <= 0), "%)"))
)
ggplot(tb_cor, aes(value, fill = side)) + 
  geom_histogram(bins = 50, alpha = 0.7) +
  xlab(paste0("Cor_term(", paste(marker_pair, collapse = ", "),")" )) +
  ggtitle("Posterior Distribution") + 
  scale_fill_manual(values = fill_colors)

Comment: Seiler’s posterior distribution entirely 100% positive

###Check if overall correlation structure changes between conditions.

value = sapply(1:nrow(Cor_diff), function(i) {
  mask = which(upper.tri(Cor_diff[i,,]), arr.ind = T)
  cord = Cor_diff[i,,]
  mean(cord[lower.tri(cord)] > 0)
})
tb_cor = tibble(value = value)
tb_cor %<>% mutate(
  side = if_else(tb_cor$value > 0.5, 
                 true = paste0("> 1/2 (", 100*mean(tb_cor$value > 0.5), "%)"),
                 false = paste0("<= 1/2 (", 100*mean(tb_cor$value <= 0.5), "%)"))
)
p_global = ggplot(tb_cor, aes(value, fill = side)) + 
  geom_histogram(bins = 25, alpha = 0.7) +
  ggtitle(expression("Overall P(Corr"~Omega~"(3rd) > Corr"~Omega~"(1st))")) +
  scale_fill_manual(values = fill_colors) +
  theme(legend.position = "bottom") +
  xlab("probability")
p_global

###Plot differential correlations.

cor_increase = apply(X = Cor_diff, MARGIN = c(2,3), FUN = function(x) mean(x > 0))
colnames(cor_increase) = rownames(cor_increase) = obj$protein_names
p_local = ggcorrplot(cor_increase, hc.order = TRUE, type = "lower",
           outline.col = "lightgray",
           colors = c("#6D9EC1", "white", "#E46726")) +
  ggtitle(expression("P(Corr"~Omega~"(3rd) > Corr"~Omega~"(1st))")) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  scale_fill_gradient2(limit = c(0, 1), midpoint = 0.5,
                       low = "#6D9EC1", mid =  "white", high = "#E46726",
                       name = "probability")
p_local

###Plot graph with edges at least 95% probability of larger correlation.

plot_correlation_graph = function(lambda) {
  graph = apply(Cor_diff, c(2,3), function(x) mean(x>0))
  diag(graph) = 0
  graph[upper.tri(graph)] = 0
  ind = which(graph > 0, arr.ind = T)
  tb_graph = tibble(
    from = obj$protein_names[ind[,1]],
    to = obj$protein_names[ind[,2]],
    prob = graph[ind]
  )
  tb_graph %<>% dplyr::filter(prob > lambda)
  tb_graph
  bayesFDR = sum(1-tb_graph$prob)/nrow(tb_graph)*100
  bayesFDR
  set.seed(0xdada)
  ig = graph_from_data_frame(tb_graph, directed = FALSE)
  ggplot(ggnetwork(ig, layout = "circle"), aes(x, y, xend = xend, yend = yend)) + 
    geom_edges(color = "black", size = 1) +
    geom_nodes(color = "black", size = 20) + 
    geom_nodetext(aes(label = vertex.names), 
                  color = "white", size = 3, fontface = "bold") +
    xlim(c(-0.1, 1.1)) +
    ylim(c(-0.1, 1.1)) + 
    ggtitle(paste0("Posterior Expected FDR: ", 
                   round(bayesFDR, digits = 1),"%")) +
    theme_blank() +
    theme(plot.title = element_text(hjust = 0.5))
}
plot_correlation_graph(lambda = 0.665)

plot_correlation_graph(lambda = 0.65)

plot_correlation_graph(lambda = 0.62)

#when lambda too high (> 0.7) gives subscript out of bounds.

Interesting to see which proteins correlate more strongly with each other. It should be noted that lambda isn’t so high and that the expected False Discovery Rate (FDR) is very hihg at around 30-35%. The strictest lambda (if smaller, an error occurs), highlights four proteins, CD11c is linked to CD107a, and CD11a and CD14 are linked. None of these proteins have been of note in other analyses. With lambda = 0.65, CD66 and CD107a come out being linked to 2 proteins each. CD66 has been an important protein throughout but CD107a has not been particularly of note so far. With lambda = 0.62, more proteins are included, and the ones with three links are CD66, CD45, HLADR, CD107a, and NKG2D.

###Combine plot for paper.

pall = plot_grid(
  p1, p2, 
  plist[[1]] + ggtitle(expression("Marker Corr"~Omega~"(Post-prime)")),
  plist[[2]] + ggtitle(expression("Marker Corr"~Omega~"(Post-boost)")),
  p_global, p_local, 
  rel_heights = c(0.38,0.31,0.31),
  nrow = 3, labels = "AUTO"
)
ggsave(plot = pall, 
       filename = "posterior_summary_plmm_3000.pdf", 
       width = 8, height = 11)

Goodness of Fit

Define a test statistics and compare observed value with posterior predictive distribution.

###Predictive distribution marginalized over cell random effects.

stan_pars = rstan::extract(obj$fit_mcmc, 
                           pars = c("beta",
                                    "sigma","sigma_term","sigma_donor",
                                    "Cor","Cor_term","Cor_donor"))
condition = "term"
term = obj$df_samples %>%
    pull(condition) %>%
    as.factor() %>%
    as.integer()
conditions_levels = levels(pull(obj$df_samples, 
                                obj$condition))
# kth posterior draw
sample_y_hat = function(k = 1) {
  set.seed(seed)
  lapply(1:2, function(cond) {
    n_cells_cond = table(term)[cond]
    beta = stan_pars$beta[k,,]
    mu = rep(0, length(protein_names))
    beta_rep = sapply(beta[,cond], rep, n_cells_cond)
    if(cond == 1) {
      sigma = stan_pars$sigma[k,]
      Cor = stan_pars$Cor[k,,]
    } else {
      sigma = stan_pars$sigma_term[k,]
      Cor = stan_pars$Cor_term[k,,]
    }
    Cov = diag(sigma) %*% Cor %*% diag(sigma)
    b = mvrnorm(n = n_cells_cond, mu, Cov)
    sigma_donor = stan_pars$sigma_donor[k,]
    Cor_donor = stan_pars$Cor_donor[k,,]
    Cov_donor = diag(sigma_donor) %*% Cor_donor %*% diag(sigma_donor)
    b_donor = mvrnorm(n = n_cells_cond, mu, Cov_donor)
    count = exp(beta_rep + b + b_donor)
    count = matrix(rpois(length(count), count),
                   nrow = nrow(count), 
                   ncol = ncol(count))
    count %<>% as.tibble
    names(count) = protein_names
    count %<>% add_column(term  = conditions_levels[cond])
    count
  }) %>% bind_rows()
}
Y_hat = sample_y_hat(k = 1)
Y_hat %>% 
  group_by(term) %>% 
  summarize_at(protein_names, median)
## # A tibble: 2 x 32
##   term   CD66 HLADR   CD3 CD107a   CD8  CD45   IL4 GranzymeB  CD56 CD62L
##   <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <int>     <dbl> <dbl> <dbl>
## 1 Post…     1     0     0      0     0     0    NA         0     0     0
## 2 Post…     0     0     0      0     0     0    NA         0     0     0
## # … with 21 more variables: CD4 <int>, CD11a <dbl>, CD2 <dbl>, CD7 <dbl>,
## #   MIP1B <dbl>, TNFa <dbl>, Ki67 <int>, NKG2D <dbl>, CD11c <dbl>,
## #   CD69 <dbl>, IFNg <dbl>, CD25 <dbl>, CD16 <dbl>, CCR5 <dbl>,
## #   CXCR4 <dbl>, CD14 <dbl>, perforine <dbl>, NKG2A <int>, CD20 <dbl>,
## #   CCR7 <dbl>, IL10 <dbl>

The only one that is not O or NA is CD66 with post-boost being 1 and post-prime equal to 0.

Check if we can model CD66, perforine, and CD45 bright cells.

gof = function(df, test_stat) {
  tfm = function(x) asinh(x/5)
  df_tfm = df %>% mutate_at(protein_names, tfm)
  df_median = df_tfm %>% summarize_at(protein_names, median)
  tibble(
    term = c(
      "Post-prime", 
      "Post-boost"
      ),
    statistic = c(
      test_stat(df_tfm %>% filter(term == "Post-prime"), df_median), 
      test_stat(df_tfm %>% filter(term == "Post-boost"), df_median)
      )
  )
}
test_stat_a = function(df_tfm, df_median) {
  mean(df_tfm$CD66 > df_median$CD66 & 
       df_tfm$perforine > df_median$perforine & 
       df_tfm$CD45 > df_median$CD45) * 100
}
gof_obsv_a = gof(df_samples, test_stat_a)
gof_pred_a = mclapply(1:dim(stan_pars$beta)[1], function(k) gof(sample_y_hat(k), test_stat_a),
                      mc.cores = ncores) %>% bind_rows() 
gof_obsv_a %<>% mutate(subset = "Cell Subset A")
gof_pred_a %<>% mutate(subset = "Cell Subset A")

Check if we can model CD66 bright, and perforine and CD45 dim cells.

test_stat_b = function(df_tfm, df_median) {
 mean(df_tfm$CD66 > df_median$CD66 & 
       df_tfm$perforine < df_median$perforine & 
       df_tfm$CD45 < df_median$CD45) * 100
}
gof_obsv_b = gof(df_samples, test_stat_b)
gof_pred_b = mclapply(1:dim(stan_pars$beta)[1], function(k) gof(sample_y_hat(k), test_stat_b),
                      mc.cores = ncores) %>% bind_rows()
gof_obsv_b %<>% mutate(subset = "Cell Subset B")
gof_pred_b %<>% mutate(subset = "Cell Subset B")

Check if we can model zero CD56 and CD11a bright cells.

test_stat_c = function(df_tfm, df_median) {
  mean(df_tfm$CD56  == 0 & 
       df_tfm$CD11a > df_median$CD11a) * 100
}
gof_obsv_c = gof(df_samples, test_stat_c)
gof_pred_c = mclapply(1:dim(stan_pars$beta)[1], function(k) gof(sample_y_hat(k), test_stat_c),
                      mc.cores = ncores) %>% bind_rows()
gof_obsv_c %<>% mutate(subset = "Cell Subset C")
gof_pred_c %<>% mutate(subset = "Cell Subset C")

Check if we can model nonzero CD56 and CD11a bright cells.

test_stat_d = function(df_tfm, df_median) {
 mean(df_tfm$CD56 > 0 & 
       df_tfm$CD11a > df_median$CD11a) * 100
}
gof_obsv_d = gof(df_samples, test_stat_d)
gof_pred_d = mclapply(1:dim(stan_pars$beta)[1], function(k) gof(sample_y_hat(k), test_stat_d),
                      mc.cores = ncores) %>% bind_rows()
gof_obsv_d %<>% mutate(subset = "Cell Subset D")
gof_pred_d %<>% mutate(subset = "Cell Subset D")

Combined plot for paper.

# combine observed statistic
gof_obsv_all = bind_rows(gof_obsv_a, gof_obsv_b, gof_obsv_c, gof_obsv_d)
# combined predicted statistic
gof_pred_all = bind_rows(gof_pred_a, gof_pred_b, gof_pred_c, gof_pred_d)
# plot everything
ggplot(gof_pred_all, aes(statistic, fill = term)) + 
  geom_histogram(bins = 40, position = "identity", alpha = 0.5) +
  geom_vline(data = gof_obsv_all, linetype = "dashed", size = 1, 
             aes(xintercept = statistic, color = term)) +
  scale_fill_few() + 
  scale_color_few() + 
  xlab("test statistic (percentage)") + 
  facet_wrap(~subset, scales = "free") + 
  theme(legend.position="bottom")

ggsave(filename = "goodness_of_fit.pdf", 
       width = 8, height = 5)

#save
save(gof_obsv_all, file = "PLMM_gof_obs_3000.Rdata")
save(gof_pred_all, file = "PLMM_gof_pred_3000.Rdata")

Session Info

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] sna_2.4                     network_1.15               
##  [3] statnet.common_4.2.0        MASS_7.3-51.4              
##  [5] ggcorrplot_0.1.2            ggnetwork_0.5.1            
##  [7] igraph_1.2.4.1              intergraph_2.0-2           
##  [9] broom_0.5.2                 RColorBrewer_1.1-2         
## [11] cowplot_0.9.4               ggthemes_4.2.0             
## [13] SummarizedExperiment_1.12.0 DelayedArray_0.8.0         
## [15] BiocParallel_1.16.6         matrixStats_0.54.0         
## [17] Biobase_2.42.0              GenomicRanges_1.34.0       
## [19] GenomeInfoDb_1.18.2         IRanges_2.16.0             
## [21] S4Vectors_0.20.1            BiocGenerics_0.28.0        
## [23] magrittr_1.5                forcats_0.4.0              
## [25] stringr_1.4.0               dplyr_0.8.0.1              
## [27] purrr_0.3.2                 readr_1.3.1                
## [29] tidyr_0.8.3                 tibble_2.1.1               
## [31] ggplot2_3.1.1               tidyverse_1.2.1            
## [33] cytoeffect_0.2.0            Rcpp_1.0.1                 
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-140           bitops_1.0-6           lubridate_1.7.4       
##  [4] progress_1.2.1         httr_1.4.0             rstan_2.18.2          
##  [7] tools_3.5.2            backports_1.1.4        utf8_1.1.4            
## [10] R6_2.4.0               lazyeval_0.2.2         colorspace_1.4-1      
## [13] withr_2.1.2            tidyselect_0.2.5       gridExtra_2.3         
## [16] prettyunits_1.0.2      processx_3.3.1         compiler_3.5.2        
## [19] cli_1.1.0              rvest_0.3.3            xml2_1.2.0            
## [22] labeling_0.3           scales_1.0.0           checkmate_1.9.3       
## [25] callr_3.2.0            rappdirs_0.3.1         digest_0.6.18         
## [28] StanHeaders_2.18.1     rmarkdown_1.12         XVector_0.22.0        
## [31] pkgconfig_2.0.2        htmltools_0.3.6        rlang_0.3.4           
## [34] readxl_1.3.1           rstudioapi_0.10        generics_0.0.2        
## [37] jsonlite_1.6           inline_0.3.15          RCurl_1.95-4.12       
## [40] GenomeInfoDbData_1.2.0 loo_2.1.0              Matrix_1.2-17         
## [43] fansi_0.4.0            munsell_0.5.0          stringi_1.4.3         
## [46] yaml_2.2.0             zlibbioc_1.28.0        pkgbuild_1.0.3        
## [49] plyr_1.8.4             grid_3.5.2             ggrepel_0.8.1         
## [52] crayon_1.3.4           lattice_0.20-38        haven_2.1.0           
## [55] hms_0.4.2              batchtools_0.9.11      zeallot_0.1.0         
## [58] knitr_1.22             ps_1.3.0               pillar_1.4.0          
## [61] base64url_1.4          reshape2_1.4.3         glue_1.3.1            
## [64] evaluate_0.13          data.table_1.12.2      modelr_0.1.4          
## [67] vctrs_0.1.0            cellranger_1.1.0       gtable_0.3.0          
## [70] assertthat_0.2.1       xfun_0.6               coda_0.19-2           
## [73] brew_1.0-6

References